{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4585157f-496c-4ea8-abf6-1f37357ada32",
   "metadata": {},
   "source": [
    "### Combining carrier counts from SNVs, Indels and SVs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c5e519f9-62f8-40b0-8bf8-db05c3a519b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7d2479e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "CHROMOSOMES = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',\n",
    "               'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',\n",
    "               'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18',\n",
    "               'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY',\n",
    "               ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6d2206da",
   "metadata": {},
   "outputs": [],
   "source": [
    "### inputs  \n",
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "sv_info_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt\"\n",
    "sv_carrier_count_10kb_gene_body_dir = wkdir_path.joinpath(\"4_vrnt_enrich/sv_count_carriers/gene_body/10kb_window/carrier_count_gene_msc_reg_af50\")\n",
    "sv_carrier_count_200kb_gene_body_dir = wkdir_path.joinpath(\"4_vrnt_enrich/sv_count_carriers/gene_body/200kb_window/carrier_count_gene_msc_reg_af50\")\n",
    "sv_carrier_count_windows_dir = wkdir_path.joinpath(\"4_vrnt_enrich/sv_count_carriers/windows/200kb_window/carrier_count\")\n",
    "snp_indel_root_dir = wkdir_path.joinpath(\"4_vrnt_enrich/snp_indel_count_carriers/count_snp_indel_carriers_af50\")\n",
    "\n",
    "vep_msc_path = wkdir_path.joinpath(\"4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_parsed.tsv\")\n",
    "out_dir = wkdir_path.joinpath(\"4_vrnt_enrich/combine_count_carriers\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "af0a02e1-4498-4288-972b-363df1ae48ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "out_dir_path = Path(out_dir)\n",
    "out_dir_path.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1869ec61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# list of SV classes including all SVs group \n",
    "sv_info_df = pd.read_csv(sv_info_path, sep=\"\\t\", dtype={\"plinkID\":str})\n",
    "sv_info_id_af_df = sv_info_df[[\"plinkID\", \"AF\", \"SVTYPE\"]].rename(columns={\"plinkID\":\"vrnt_id\"})\n",
    "sv_type_list = sv_info_id_af_df.SVTYPE.unique().tolist()\n",
    "all_sv_type_list = [\"all_sv\"] + sv_type_list\n",
    "\n",
    "# list of VEP consequences \n",
    "vep_msc_ranks = [\n",
    "    \"transcript_ablation\",\n",
    "    \"splice_acceptor_variant\",\n",
    "    \"splice_donor_variant\",\n",
    "    \"stop_gained\",\n",
    "    \"frameshift_variant\",\n",
    "    \"stop_lost\",\n",
    "    \"start_lost\",\n",
    "    \"transcript_amplification\",\n",
    "    \"inframe_insertion\",\n",
    "    \"inframe_deletion\",\n",
    "    \"missense_variant\",\n",
    "    \"protein_altering_variant\",\n",
    "    \"splice_region_variant\",\n",
    "    \"splice_donor_5th_base_variant\", \n",
    "    \"splice_donor_region_variant\", \n",
    "    \"splice_polypyrimidine_tract_variant\"\n",
    "    \"incomplete_terminal_codon_variant\",\n",
    "    \"start_retained_variant\",\n",
    "    \"stop_retained_variant\",\n",
    "    \"synonymous_variant\",\n",
    "    \"coding_sequence_variant\",\n",
    "    \"mature_miRNA_variant\",\n",
    "    \"5_prime_UTR_variant\",\n",
    "    \"3_prime_UTR_variant\",\n",
    "    \"non_coding_transcript_exon_variant\",\n",
    "    \"intron_variant\",\n",
    "    \"NMD_transcript_variant\",\n",
    "    \"non_coding_transcript_variant\",\n",
    "    \"upstream_gene_variant\",\n",
    "    \"downstream_gene_variant\",\n",
    "    \"TFBS_ablation\",\n",
    "    \"TFBS_amplification\",\n",
    "    \"TF_binding_site_variant\",\n",
    "    \"regulatory_region_ablation\",\n",
    "    \"regulatory_region_amplification\",\n",
    "    \"feature_elongation\",\n",
    "    \"regulatory_region_variant\",\n",
    "    \"feature_truncation\",\n",
    "    \"intergenic_variant\",\n",
    "    \"no_predicted_effect\"\n",
    "]\n",
    "\n",
    "carrier_count_cols= [\"misexp_carrier\", \"misexp_total\", \"control_carrier\", \"control_total\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1b76767",
   "metadata": {},
   "source": [
    "### SV gene body carrier counts +/- 10kb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c1e2396c",
   "metadata": {},
   "outputs": [],
   "source": [
    "sv_carrier_count_10kb_gene_body_path = Path(sv_carrier_count_10kb_gene_body_dir)\n",
    "# combine control and misexpression count files from chromosomes \n",
    "carrier_count_df_list = []\n",
    "for chrom in CHROMOSOMES[:22]: \n",
    "    carrier_count_path = sv_carrier_count_10kb_gene_body_dir.joinpath(f\"{chrom}_carrier_count_gene_msc.tsv\")\n",
    "    carrier_count_df = pd.read_csv(carrier_count_path, sep=\"\\t\")\n",
    "    carrier_count_df_list.append(carrier_count_df)\n",
    "all_chrom_carrier_count_10kb_gene_body_df = pd.concat(carrier_count_df_list).drop(columns=[\"smpls_pass_qc\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b5fff4b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "window_raw = \"gene_body_10\"\n",
    "window_name = \"gene body +/-10kb\"\n",
    "sv_10kb_enrich_results_dict = {}\n",
    "sv_10kb_enrich_count = 0\n",
    "af_range_list = [[0,1], [1, 5], [5, 10], [10,50]]\n",
    "\n",
    "for af_range in af_range_list: \n",
    "    af_lower, af_upper = af_range\n",
    "    af_range_name = f\"{af_lower}-{af_upper}\"\n",
    "    carrier_count_maf_df = all_chrom_carrier_count_10kb_gene_body_df[all_chrom_carrier_count_10kb_gene_body_df.maf_bin == af_range_name]\n",
    "    # count all events for chromosomes \n",
    "    carrier_count_summed_df = carrier_count_maf_df.groupby(by=[\"z_cutoff\"], as_index=False).sum()\n",
    "    # calculate odds and risk ratio with confidence intervals \n",
    "    for z_cutoff in carrier_count_summed_df.z_cutoff:\n",
    "        name = f\"> {round(z_cutoff)}\"\n",
    "        # total misexpression and control events \n",
    "        total_misexp = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"total_misexp\"].item()\n",
    "        total_control = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"total_control\"].item()\n",
    "        for sv_type in all_sv_type_list: \n",
    "            # all SVs and different SV classes \n",
    "            misexp_carrier = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"{sv_type}_misexp\"].item()\n",
    "            cntrl_carrier = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"{sv_type}_contrl\"].item()\n",
    "            conting_mtx_info = [misexp_carrier, total_misexp, cntrl_carrier, total_control]\n",
    "            sv_10kb_enrich_results_dict[sv_10kb_enrich_count] = [f\"{sv_type}\", \"all\", af_range_name, z_cutoff, name, window_raw, window_name] + conting_mtx_info\n",
    "            sv_10kb_enrich_count += 1 \n",
    "            # gene-level most severe consequences stratified by SV type \n",
    "            if sv_type == \"all_sv\": \n",
    "                continue\n",
    "            for msc in vep_msc_ranks: \n",
    "                misexp_carrier = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"{sv_type}_{msc}_misexp\"].item()\n",
    "                cntrl_carrier = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"{sv_type}_{msc}_contrl\"].item()\n",
    "                conting_mtx_info = [misexp_carrier, total_misexp, cntrl_carrier, total_control]\n",
    "                sv_10kb_enrich_results_dict[sv_10kb_enrich_count] = [f\"{sv_type}\", f\"{msc}\", af_range_name, z_cutoff, name, window_raw, window_name] + conting_mtx_info\n",
    "                sv_10kb_enrich_count += 1 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "10033f59",
   "metadata": {},
   "outputs": [],
   "source": [
    "sv_cols = [\"vrnt_type\", \"consequence\", \"maf_range\", \"z_cutoff\", \"z_cutoff_name\", \"window_raw\", \"window_name\"]\n",
    "sv_enrich_cols = sv_cols + carrier_count_cols\n",
    "sv_10kb_enrich_results_df = pd.DataFrame.from_dict(sv_10kb_enrich_results_dict, orient=\"index\", columns=sv_enrich_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "00adc678",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Percentage misexpression events with rare SV 2: 0.37974683544303794% (66/17380)\n",
      "Percentage misexpression events with rare SV 10: 0.5061867266591676% (54/10668)\n",
      "Percentage misexpression events with rare SV 20: 0.6923409779316313% (32/4622)\n",
      "Percentage misexpression events with rare SV 30: 1.188707280832095% (24/2019)\n",
      "Percentage misexpression events with rare SV 40: 2.3661270236612704% (19/803)\n"
     ]
    }
   ],
   "source": [
    "# proportion of misexpression events with a rare SV\n",
    "perc_misexp_sv_carrier_10kb = {}\n",
    "for i, z_score in enumerate([2, 10, 20, 30, 40]):\n",
    "    misexp_cntrl_carrier_df = sv_10kb_enrich_results_df[(sv_10kb_enrich_results_df.vrnt_type == \"all_sv\") & \n",
    "                                                      (sv_10kb_enrich_results_df.maf_range == \"0-1\") & \n",
    "                                                      (sv_10kb_enrich_results_df.consequence == \"all\") & \n",
    "                                                      (sv_10kb_enrich_results_df.window_raw == \"gene_body_10\") &\n",
    "                                                      (sv_10kb_enrich_results_df.z_cutoff == z_score)\n",
    "                                                     ]\n",
    "    misexp_carrier = misexp_cntrl_carrier_df.misexp_carrier.item()\n",
    "    misexp_total = misexp_cntrl_carrier_df.misexp_total.item()\n",
    "    perc_carrier = (misexp_carrier/misexp_total)*100\n",
    "    perc_misexp_sv_carrier_10kb[i] = [f\"> {z_score}\", perc_carrier, misexp_carrier, misexp_total]\n",
    "    print(f\"Percentage misexpression events with rare SV {z_score}: {(misexp_carrier/misexp_total)*100}% ({misexp_carrier}/{misexp_total})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "3ed63c80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write carrier % to file \n",
    "perc_misexp_sv_carrier_cols = [\"z-score\", \"perc_carrier\", \"misexp_carrier\", \"misexp_total\"]\n",
    "perc_misexp_sv_carrier_10kb_df = pd.DataFrame.from_dict(perc_misexp_sv_carrier_10kb, orient=\"index\", columns=perc_misexp_sv_carrier_cols)\n",
    "# write to file\n",
    "perc_misexp_sv_carrier_10kb_path = out_dir.joinpath(\"perc_misexp_carrier_zscores_10kb.tsv\")\n",
    "perc_misexp_sv_carrier_10kb_df.to_csv(perc_misexp_sv_carrier_10kb_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb82a5cf",
   "metadata": {},
   "source": [
    "### SV gene body carrier counts +/- 200kb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "900540f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "sv_carrier_count_200kb_gene_body_path = Path(sv_carrier_count_200kb_gene_body_dir)\n",
    "# combine control and misexpression count files from chromosomes \n",
    "carrier_count_df_list = []\n",
    "for chrom in CHROMOSOMES[:22]: \n",
    "    carrier_count_path = sv_carrier_count_200kb_gene_body_dir.joinpath(f\"{chrom}_carrier_count_gene_msc.tsv\")\n",
    "    carrier_count_df = pd.read_csv(carrier_count_path, sep=\"\\t\")\n",
    "    carrier_count_df_list.append(carrier_count_df)\n",
    "all_chrom_carrier_count_200kb_gene_body_df = pd.concat(carrier_count_df_list).drop(columns=[\"smpls_pass_qc\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5d1a202d",
   "metadata": {},
   "outputs": [],
   "source": [
    "window_raw = \"gene_body_200\"\n",
    "window_name = \"gene body +/-200kb\"\n",
    "sv_200kb_enrich_results_dict = {}\n",
    "sv_200kb_enrich_count = 0\n",
    "af_range_list = [[0,1], [1, 5], [5, 10], [10,50]]\n",
    "\n",
    "for af_range in af_range_list: \n",
    "    af_lower, af_upper = af_range\n",
    "    af_range_name = f\"{af_lower}-{af_upper}\"\n",
    "    carrier_count_maf_df = all_chrom_carrier_count_200kb_gene_body_df[all_chrom_carrier_count_200kb_gene_body_df.maf_bin == af_range_name]\n",
    "    # count all events for chromosomes \n",
    "    carrier_count_summed_df = carrier_count_maf_df.groupby(by=[\"z_cutoff\"], as_index=False).sum()\n",
    "    # calculate odds and risk ratio with confidence intervals \n",
    "    for z_cutoff in carrier_count_summed_df.z_cutoff:\n",
    "        name = f\"> {round(z_cutoff)}\"\n",
    "        # total misexpression and control events \n",
    "        total_misexp = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"total_misexp\"].item()\n",
    "        total_control = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"total_control\"].item()\n",
    "        for sv_type in all_sv_type_list: \n",
    "            # all SVs and different SV classes \n",
    "            misexp_carrier = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"{sv_type}_misexp\"].item()\n",
    "            cntrl_carrier = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"{sv_type}_contrl\"].item()\n",
    "            conting_mtx_info = [misexp_carrier, total_misexp, cntrl_carrier, total_control]\n",
    "            sv_200kb_enrich_results_dict[sv_200kb_enrich_count] = [f\"{sv_type}\", \"all\", af_range_name, z_cutoff, name, window_raw, window_name] + conting_mtx_info\n",
    "            sv_200kb_enrich_count += 1 \n",
    "            # gene-level most severe consequences stratified by SV type \n",
    "            if sv_type == \"all_sv\": \n",
    "                continue\n",
    "            for msc in vep_msc_ranks: \n",
    "                misexp_carrier = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"{sv_type}_{msc}_misexp\"].item()\n",
    "                cntrl_carrier = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff][f\"{sv_type}_{msc}_contrl\"].item()\n",
    "                conting_mtx_info = [misexp_carrier, total_misexp, cntrl_carrier, total_control]\n",
    "                sv_200kb_enrich_results_dict[sv_200kb_enrich_count] = [f\"{sv_type}\", f\"{msc}\", af_range_name, z_cutoff, name, window_raw, window_name] + conting_mtx_info\n",
    "                sv_200kb_enrich_count += 1 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "fde7de3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "sv_cols = [\"vrnt_type\", \"consequence\", \"maf_range\", \"z_cutoff\", \"z_cutoff_name\", \"window_raw\", \"window_name\"]\n",
    "sv_enrich_cols = sv_cols + carrier_count_cols\n",
    "sv_200kb_enrich_results_df = pd.DataFrame.from_dict(sv_200kb_enrich_results_dict, orient=\"index\", columns=sv_enrich_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "32dc839f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Percentage misexpression events with rare SV 2: 1.3176064441887225% (229/17380)\n",
      "Percentage misexpression events with rare SV 10: 1.527934008248969% (163/10668)\n",
      "Percentage misexpression events with rare SV 20: 1.8390307226308955% (85/4622)\n",
      "Percentage misexpression events with rare SV 30: 2.4764735017335315% (50/2019)\n",
      "Percentage misexpression events with rare SV 40: 4.732254047322541% (38/803)\n"
     ]
    }
   ],
   "source": [
    "# proportion of misexpression events with a rare SV\n",
    "perc_misexp_sv_carrier_200kb = {}\n",
    "for i, z_score in enumerate([2, 10, 20, 30, 40]):\n",
    "    misexp_cntrl_carrier_df = sv_200kb_enrich_results_df[(sv_200kb_enrich_results_df.vrnt_type == \"all_sv\") & \n",
    "                                                      (sv_200kb_enrich_results_df.maf_range == \"0-1\") & \n",
    "                                                      (sv_200kb_enrich_results_df.consequence == \"all\") & \n",
    "                                                      (sv_200kb_enrich_results_df.window_raw == \"gene_body_200\") &\n",
    "                                                      (sv_200kb_enrich_results_df.z_cutoff == z_score)\n",
    "                                                     ]\n",
    "    misexp_carrier = misexp_cntrl_carrier_df.misexp_carrier.item()\n",
    "    misexp_total = misexp_cntrl_carrier_df.misexp_total.item()\n",
    "    perc_carrier = (misexp_carrier/misexp_total)*100\n",
    "    perc_misexp_sv_carrier_200kb[i] = [f\"> {z_score}\", perc_carrier, misexp_carrier, misexp_total]\n",
    "    print(f\"Percentage misexpression events with rare SV {z_score}: {(misexp_carrier/misexp_total)*100}% ({misexp_carrier}/{misexp_total})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "12a3d5d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write carrier % to file \n",
    "perc_misexp_sv_carrier_200kb_df = pd.DataFrame.from_dict(perc_misexp_sv_carrier_200kb, orient=\"index\", columns=perc_misexp_sv_carrier_cols)\n",
    "# write to file\n",
    "perc_misexp_sv_carrier_200kb_path = out_dir.joinpath(\"perc_misexp_carrier_zscores_200kb.tsv\")\n",
    "perc_misexp_sv_carrier_200kb_df.to_csv(perc_misexp_sv_carrier_200kb_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de8ef013",
   "metadata": {},
   "source": [
    "### SV windows carrier count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d11f8d7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "window_names_dict = {\"downstream_0\": \"gene body\",\n",
    "                     \"upstream_0\": \"gene body\",\n",
    "                     \"gene_body\": \"gene body\",\n",
    "                     \"gene_body_window_10000\": \"gene body +/-10kb\",\n",
    "                     \"downstream_1000000\": \"800kb to 1Mb\",\n",
    "                     \"downstream_200000\": \"TTS to 200kb\",\n",
    "                     \"downstream_400000\": \"200kb to 400kb\",\n",
    "                     \"downstream_600000\": \"400kb to 600kb\",\n",
    "                     \"downstream_800000\": \"600kb to 800kb\",\n",
    "                     \"upstream_1000000\": \"-800kb to -1Mb\",\n",
    "                     \"upstream_200000\": \"TSS to -200kb\",\n",
    "                     \"upstream_400000\": \"-200kb to -400kb\",\n",
    "                     \"upstream_600000\": \"-400kb to -600kb\",\n",
    "                     \"upstream_800000\": \"-600kb to -800kb\",\n",
    "                    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "f6eea11f",
   "metadata": {},
   "outputs": [],
   "source": [
    "sv_carrier_count_windows_path = Path(sv_carrier_count_windows_dir)\n",
    "# combine control and misexpression count files from chromosomes \n",
    "sv_windows_enrich_results_dict = {}\n",
    "sv_window_count = 0\n",
    "af_range_list = [[0,1]]\n",
    "for af_range in af_range_list: \n",
    "    af_lower, af_upper = af_range\n",
    "    af_range_name = f\"{af_lower}-{af_upper}\"\n",
    "    carrier_count_df_list = []\n",
    "    for chrom in CHROMOSOMES[:22]: \n",
    "        carrier_count_path = sv_carrier_count_windows_path.joinpath(f\"{chrom}_carrier_count.tsv\")\n",
    "        carrier_count_df = pd.read_csv(carrier_count_path, sep=\"\\t\")\n",
    "        carrier_count_df_list.append(carrier_count_df)\n",
    "    all_chrom_carrier_count_df = pd.concat(carrier_count_df_list).drop(columns=[\"smpls_pass_qc\"])\n",
    "    # count all events for chromosomes \n",
    "    all_chrom_carrier_count_df[\"name\"] = all_chrom_carrier_count_df.z_cutoff.astype(str) + \"_\" + all_chrom_carrier_count_df.direction + \"_\" + all_chrom_carrier_count_df.window_size.astype(str)\n",
    "    all_chrom_carrier_count_df = all_chrom_carrier_count_df.drop(columns=[\"z_cutoff\", \"direction\", \"window_size\"])\n",
    "    carrier_count_summed_df = all_chrom_carrier_count_df.groupby([\"name\"]).sum()\n",
    "    all_enrich_results = {}\n",
    "    \n",
    "    for window in carrier_count_summed_df.index:\n",
    "        z_cutoff = float(window.split(\"_\")[0])\n",
    "        name = f\"> {round(z_cutoff)}\"\n",
    "        window_raw = window.split(\"_\")[1] + \"_\" + window.split(\"_\")[2]\n",
    "        window_name = window_names_dict[window_raw]\n",
    "        total_misexp = carrier_count_summed_df[carrier_count_summed_df.index == window][f\"total_misexp\"].item()\n",
    "        total_control = carrier_count_summed_df[carrier_count_summed_df.index == window][f\"total_control\"].item()\n",
    "        for sv_type in all_sv_type_list: \n",
    "            misexp_carrier = carrier_count_summed_df[carrier_count_summed_df.index == window][f\"{sv_type}_misexp\"].item()\n",
    "            cntrl_carrier = carrier_count_summed_df[carrier_count_summed_df.index == window][f\"{sv_type}_contrl\"].item()\n",
    "            conting_mtx_info = [misexp_carrier, total_misexp, cntrl_carrier, total_control]\n",
    "            sv_windows_enrich_results_dict[sv_window_count] = [f\"{sv_type}\", \"all\", af_range_name, z_cutoff, name, window_raw, window_name] + conting_mtx_info\n",
    "            sv_window_count += 1 \n",
    "                \n",
    "sv_window_enrich_results_df = pd.DataFrame.from_dict(sv_windows_enrich_results_dict, orient=\"index\", columns=sv_enrich_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "de7fed64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop duplicate results \n",
    "sv_window_enrich_results_no_dupl_df = sv_window_enrich_results_df[sv_window_enrich_results_df.window_raw != \"downstream_0\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "7ebd5ccd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# combine SV results \n",
    "sv_combined_results_df = pd.concat([sv_10kb_enrich_results_df, \n",
    "                                    sv_200kb_enrich_results_df, \n",
    "                                    sv_window_enrich_results_no_dupl_df, \n",
    "                                   ])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1351f592",
   "metadata": {},
   "source": [
    "### SNVs and indels carrier counts "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "7e6e94b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "snp_indel_root_path = Path(snp_indel_root_dir)\n",
    "snp_indel_carrier_count_df_list = []\n",
    "for chrom in CHROMOSOMES[:22]:\n",
    "    carrier_count_chrom_path = snp_indel_root_path.joinpath(f\"{chrom}_carrier_count.tsv\")\n",
    "    carrier_count_chrom_df = pd.read_csv(carrier_count_chrom_path, sep=\"\\t\")\n",
    "    snp_indel_carrier_count_df_list.append(carrier_count_chrom_df)\n",
    "all_chrom_snp_indel_carrier_count_df = pd.concat(snp_indel_carrier_count_df_list)\n",
    "all_chrom_snp_indel_carrier_sum_df = all_chrom_snp_indel_carrier_count_df.groupby(by=[\"z_cutoff\", \"window\", \"vrnt_type\", \"maf_range\"], as_index=False).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "f6f8bab2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SNV and indel gene body +/-10kb test\n",
    "snp_indel_10kb_set_df = all_chrom_snp_indel_carrier_sum_df[(all_chrom_snp_indel_carrier_sum_df.window == 'gene_body_window_10000')].copy()\n",
    "# SNV and indel windows, rare variants, all TPM \n",
    "snp_indel_window_set_df = all_chrom_snp_indel_carrier_sum_df[(all_chrom_snp_indel_carrier_sum_df.maf_range == \"0-0.01\") &\n",
    "                                                             (all_chrom_snp_indel_carrier_sum_df.window != \"gene_body_window_10000\")].copy()\n",
    "# combine\n",
    "snp_indel_all_results_df = pd.concat([snp_indel_10kb_set_df, snp_indel_window_set_df])\n",
    "# add consequence \n",
    "snp_indel_all_results_df[\"consequence\"] = \"all\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "572546b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename MAF ranges\n",
    "rename_maf_range={\"0-0.01\": \"0-1\", \"0.01-0.05\": \"1-5\", \n",
    "                  \"0.05-0.1\": \"5-10\", \"0.1-0.5\": \"10-50\"}\n",
    "snp_indel_all_results_df[\"maf_range\"]  = snp_indel_all_results_df[\"maf_range\"].replace(rename_maf_range)\n",
    "# rename TPM cutoffs \n",
    "snp_indel_all_results_df[\"z_cutoff_name\"]  = \"> \" + snp_indel_all_results_df[\"z_cutoff\"].astype(str)\n",
    "# window naming \n",
    "snp_indel_all_results_df[\"window_name\"]  = snp_indel_all_results_df[\"window\"].replace(window_names_dict)\n",
    "snp_indel_all_results_df = snp_indel_all_results_df.rename(columns={\"window\": \"window_raw\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "dcd8529e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# combine SV and SNV/indel carrier count results \n",
    "snp_indel_sv_results_df = pd.concat([sv_combined_results_df, snp_indel_all_results_df])\n",
    "snp_indel_sv_results_df[\"consequence_name\"] = snp_indel_sv_results_df.consequence.str.split(\"_\").str.join(\" \").str.capitalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "b21c9ae9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write all results \n",
    "snp_indel_sv_results_df.to_csv(out_dir.joinpath(\"snp_indel_sv_all_carrier_count_z_cutoff.tsv\"), sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf339cc5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
